AI vs. non-AI jobs

Author

Yanyang He

Data Collection & Cleaning

import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
'''
Download the Dataset:

gdown 1VNBTxArDMN2o9fJBDImaON6YUAyJGOU6
unzip lightcast_job_postings.zip
'''
df = pd.read_csv('lightcast_job_postings.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72476 entries, 0 to 72475
Columns: 131 entries, ID to NAICS_2022_6_NAME
dtypes: bool(2), float64(11), int64(27), object(91)
memory usage: 71.5+ MB
df.dtypes
ID                        object
LAST_UPDATED_DATE         object
LAST_UPDATED_TIMESTAMP    object
DUPLICATES                 int64
POSTED                    object
                           ...  
NAICS_2022_4_NAME         object
NAICS_2022_5               int64
NAICS_2022_5_NAME         object
NAICS_2022_6               int64
NAICS_2022_6_NAME         object
Length: 131, dtype: object
df.head()
ID LAST_UPDATED_DATE LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION SOURCE_TYPES SOURCES URL ... NAICS_2022_2 NAICS_2022_2_NAME NAICS_2022_3 NAICS_2022_3_NAME NAICS_2022_4 NAICS_2022_4_NAME NAICS_2022_5 NAICS_2022_5_NAME NAICS_2022_6 NAICS_2022_6_NAME
0 1f57d95acf4dc67ed2819eb12f049f6a5c11782c 2024-09-06 2024-09-06 20:32:57.352 Z 0 2024-06-02 2024-06-08 6.0 [\n "Company"\n] [\n "brassring.com"\n] [\n "https://sjobs.brassring.com/TGnewUI/Sear... ... 44 Retail Trade 441 Motor Vehicle and Parts Dealers 4413 Automotive Parts, Accessories, and Tire Retailers 44133 Automotive Parts and Accessories Retailers 441330 Automotive Parts and Accessories Retailers
1 0cb072af26757b6c4ea9464472a50a443af681ac 2024-08-02 2024-08-02 17:08:58.838 Z 0 2024-06-02 2024-08-01 NaN [\n "Job Board"\n] [\n "maine.gov"\n] [\n "https://joblink.maine.gov/jobs/1085740"\n] ... 56 Administrative and Support and Waste Managemen... 561 Administrative and Support Services 5613 Employment Services 56132 Temporary Help Services 561320 Temporary Help Services
2 85318b12b3331fa490d32ad014379df01855c557 2024-09-06 2024-09-06 20:32:57.352 Z 1 2024-06-02 2024-07-07 35.0 [\n "Job Board"\n] [\n "dejobs.org"\n] [\n "https://dejobs.org/dallas-tx/data-analys... ... 52 Finance and Insurance 524 Insurance Carriers and Related Activities 5242 Agencies, Brokerages, and Other Insurance Rela... 52429 Other Insurance Related Activities 524291 Claims Adjusting
3 1b5c3941e54a1889ef4f8ae55b401a550708a310 2024-09-06 2024-09-06 20:32:57.352 Z 1 2024-06-02 2024-07-20 48.0 [\n "Job Board"\n] [\n "disabledperson.com",\n "dejobs.org"\n] [\n "https://www.disabledperson.com/jobs/5948... ... 52 Finance and Insurance 522 Credit Intermediation and Related Activities 5221 Depository Credit Intermediation 52211 Commercial Banking 522110 Commercial Banking
4 cb5ca25f02bdf25c13edfede7931508bfd9e858f 2024-06-19 2024-06-19 07:00:00.000 Z 0 2024-06-02 2024-06-17 15.0 [\n "FreeJobBoard"\n] [\n "craigslist.org"\n] [\n "https://modesto.craigslist.org/sls/77475... ... 99 Unclassified Industry 999 Unclassified Industry 9999 Unclassified Industry 99999 Unclassified Industry 999999 Unclassified Industry

5 rows × 131 columns

# remove duplicated rows
df = df.drop_duplicates(subset='ID')

# convert date columns to datetime format
df['LAST_UPDATED_DATE'] = pd.to_datetime(df['LAST_UPDATED_DATE'])
df['LAST_UPDATED_TIMESTAMP'] = pd.to_datetime(df['LAST_UPDATED_TIMESTAMP'])
df['POSTED'] = pd.to_datetime(df['POSTED'])
df['EXPIRED'] = pd.to_datetime(df['EXPIRED'])

# remove internship
df = df[df['IS_INTERNSHIP'] == False]

# remove non-full time jobs
df = df[df['EMPLOYMENT_TYPE_NAME'] == 'Full-time (> 32 hours)']

# remove remote/hybrid jobs, we want to focus on in-person jobs
df = df[df['REMOTE_TYPE_NAME'] == '[None]']

# salary
df['SALARY'] = df['SALARY'].fillna((df['SALARY_FROM'] + df['SALARY_TO']) / 2)  # mean if lower/upper bound avaiable
df['SALARY'] = df['SALARY'].fillna(df['SALARY_FROM'])  # loewr bound if no upper bound presented
df['SALARY'] = df['SALARY'].fillna(df['SALARY_TO'])  # upper bound if no lower bound presented
# identifying ai/non-ai jobs by keyword searching
keywords = ['AI', 'Artificial Intelligence', 'Machine Learning', 'Deep Learning',
            'Data Science', 'Data Analysis', 'Data Analyst', 'Data Analytics',
            'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
            'Computer Vision']

match = lambda col: df[col].str.contains('|'.join(keywords), case=False, na=False)

df['AI_JOB'] = match('TITLE_NAME') \
             | match('SKILLS_NAME') \
             | match('SPECIALIZED_SKILLS_NAME') \
             | match('LIGHTCAST_SECTORS_NAME')
df['AI_JOB'].value_counts()
AI_JOB
True     32622
False    20272
Name: count, dtype: int64

Data Analysis & Visualization

Q2: AI-driven Job Growth vs. Job Displacement

df_grouped = df.groupby(['POSTED', 'AI_JOB']).size().reset_index(name='Job_Count')

px.line(df_grouped, x='POSTED', y='Job_Count', color='AI_JOB',
        title="AI vs. Non-AI Job Posted Over Time",
        labels={'POSTED_MONTH': 'Month', 'Job_Count': 'Number of Job Postings'},
        markers=True)
df_grouped = df.groupby(['AI_JOB', 'MIN_YEARS_EXPERIENCE']).size().reset_index(name='Job_Count')

px.bar(df_grouped, x='MIN_YEARS_EXPERIENCE', y='Job_Count', color='AI_JOB',
       title="AI vs. Non-AI Jobs by Minimum Years of Experience",
       labels={'MIN_YEARS_EXPERIENCE': 'Min Years of Experience', 'Job_Count': 'Number of Jobs'},
       barmode='group')
df_grouped = df[df['SALARY'].notna()].groupby(['AI_JOB', 'MIN_YEARS_EXPERIENCE'])['SALARY'].mean().reset_index()

px.bar(df_grouped, x=['AI_JOB', 'MIN_YEARS_EXPERIENCE'], y='SALARY',
       title="Average Salary: AI vs. Non-AI Jobs",
       labels={'AI_JOB': 'Job Type', 'SALARY': 'Average Salary'},
       color='AI_JOB')

Q3: AI Careers vs. Traditional Roles

import ast
df['SKILLS'] = df['SKILLS_NAME'].apply(ast.literal_eval)

ai_skills = df[df['AI_JOB']]['SKILLS'].explode().value_counts().head(20).reset_index()
ai_skills.columns = ['Skill', 'Count']

fig = px.bar(ai_skills, x='Skill', y='Count',
             title="Top AI Job Skills",
             labels={'Skill': 'Skill Name', 'Count': 'Frequency'},
             color='Skill')
fig.show()

traditional_skills = df[~df['AI_JOB']]['SKILLS'].explode().value_counts().head(20).reset_index()
traditional_skills.columns = ['Skill', 'Count']

fig = px.bar(traditional_skills, x='Skill', y='Count',
             title="Top Traditional Job Skills",
             labels={'Skill': 'Skill Name', 'Count': 'Frequency'},
             color='Skill')
fig.show()

Q4: Emerging AI Job

job_title_counts = df[df['AI_JOB']]['TITLE_NAME'].value_counts().head(20).reset_index()
job_title_counts.columns = ['Job_Title', 'Count']

fig = px.bar(top_ai_job_titles, x='Job_Title', y='Count',
             title="Top Emerging AI Job Titles",
             labels={'Job_Title': 'Job Title', 'Count': 'Frequency'},
             color='Job_Title')

fig.show()